This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

Report

The report is the knitted version of the current document (this Rmd).

Category Unsatisfactory Satisfactory
Effort Some task q’s left unattempted All task q’s attempted
Observed Did not document observations, or observations incorrect Documented correct observations based on analysis
Supported Some observations not supported by analysis, or errors in analysis All observations clearly and correctly supported by analysis (table, graph, etc.)
Assessed Observations include claims not supported by the data, or reflect a level of certainty not warranted by the data Observations are appropriately qualified by the quality & relevance of the data and the (in)conclusiveness of the Support
Code Styled Violations of the style guide hinder readability Code sufficiently close to the style guide

Setup

library(tidyverse)
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## NOTE: No need to edit; feel free to re-use this code!
theme_common <- function() {
  theme_minimal() %+replace%
  theme(
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12),
    axis.title.x = element_text(margin = margin(4, 4, 4, 4), size = 16),
    axis.title.y = element_text(margin = margin(4, 4, 4, 4), size = 16, angle = 90),

    legend.title = element_text(size = 16),
    legend.text = element_text(size = 12),

    strip.text.x = element_text(size = 12),
    strip.text.y = element_text(size = 12),

    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_line(color = "grey90"),

    aspect.ratio = 4 / 4,

    plot.margin = unit(c(t = +0, b = +0, r = +0, l = +0), "cm"),
    plot.title = element_text(size = 18),
    plot.title.position = "plot",
    plot.subtitle = element_text(size = 16),
    plot.caption = element_text(size = 12)
  )
}

What question did you set out to answer?:

Is there a correlation between coronavirus cases and the food prices within the United States?

What data did you find to help answer that question?

What is the relevant background on your question?

mw <- read.csv("data/midwest_bls_cpi.csv", skip = 3, header = TRUE) 
ne = read.csv("data/northeast_bls_cpi.csv", skip = 3, header = TRUE)
s = read.csv("data/south_bls_cpi.csv", skip = 3, header = TRUE)
w = read.csv("data/west_bls_cpi.csv", skip = 3, header = TRUE)
data = rbind(mw, ne, s, w)
area_codes = read.csv("data/area_codes.csv")
item_codes = read.csv("data/item_codes.csv", sep ="\t")

months = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
mon = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# mon_factor = factor(mon, mon, ordered = TRUE)
data_split <- data %>% 
  separate(
    col = 1,
    into = c("AP", "Seasonal Adjust", "area", "item"), 
    sep = c(2, 3, 7)
  ) %>% 
  mutate(
    item = replace(item, item == "712111", "712112")
  ) %>%
  merge(
    item_codes, by.x = "item", by.y = "item_code"
  ) %>%
  merge(
    area_codes %>% select(-X), by.x = "area", by.y = "area_code"
  ) %>% 
  pivot_longer(
    cols = (-c("AP", "Seasonal Adjust", "area", "item", "item_name", "area_name")),
    names_to = "date",
    values_to = "price"
  ) %>% 
  separate(
    col = "date",
    into = c("month", "year"),
    sep = "_"
  ) %>% 
  mutate(
    year = as.integer(year),
    month = factor(month, mon, ordered = TRUE)
  ) 

data_split
data_split %>%
  filter(
    item == '712112',
    year == 2018 | year == 2019 | year == 2020
  ) %>%
  ggplot(aes(x = month, y = price, color = area_name)) +
  geom_point() +
  theme_common() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_grid(~year) +
  ggtitle("Price of Potatos by Region")
## Warning: Removed 166 rows containing missing values (geom_point).

# Import the list of states with what region they belong to
states <- read_csv("data/states_with_regions.csv")
## Parsed with column specification:
## cols(
##   state = col_character(),
##   region = col_character()
## )
# Get the live data from the NYT repo
url_state <- 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv'
filename_nyt <- "./data/us_states.csv"

curl::curl_download(
        url_state,
        destfile = filename_nyt
      )

df_covid <- read_csv(filename_nyt) %>%
  separate( #Separate YYYY-MM-DD into 3 columns
    col = date,
    sep = '-',
    into = c("year", "month", "day")
  ) %>% 
  mutate( #Make year month day into integers
    year = as.integer(year),
    month = as.integer(month),
    day = as.integer(day)
  ) %>%
  group_by(state, year, month, fips) %>% #Group by these values to keep them
  summarize( #Summarize to get total cases over each month in each state
    cases = max(cases, na.rm = TRUE),
    deaths = max(deaths, na.rm = TRUE)
  ) %>%
  left_join(states, by = "state") #Add in the region for this state
## Parsed with column specification:
## cols(
##   date = col_date(format = ""),
##   state = col_character(),
##   fips = col_character(),
##   cases = col_double(),
##   deaths = col_double()
## )
## `summarise()` regrouping output by 'state', 'year', 'month' (override with `.groups` argument)
df_covid
# Read the population data
filename_population <- "./data/ACSDT5Y2018.B01003_data_with_overlays_2020-10-22T174815.csv"

df_pop <- read_csv(filename_population, skip = 1) %>%
  left_join(states, by = "state") %>% #get region
  select( #rename population and get rid of unnecessary ubfi
    state,
    population_2018 = `Estimate!!Total`,
    region = region
  ) %>%
  group_by(region) %>%
  summarize( #get the population for each region
    population = sum(population_2018, na.rm = TRUE)
  ) %>%
  filter(is.na(region) == FALSE) #get rid of the NA region that was from the row of state = United States
## Parsed with column specification:
## cols(
##   id = col_character(),
##   state = col_character(),
##   `Estimate!!Total` = col_double(),
##   `Margin of Error!!Total` = col_character()
## )
## `summarise()` ungrouping output (override with `.groups` argument)
# Add the population data to the COVID dataframe
df_covid_total <- df_covid %>%
  mutate( #Change numeric month into mon, the same format as the price data
    month = factor(month, levels = c(1,2,3,4,5,6,7,8,9,10,11,12), labels = mon, ordered = TRUE),
    .keep = "unused"
  ) %>%
  group_by(year, month, region) %>%
  summarize(
    cases = sum(cases, na.rm = TRUE),
    deaths = sum(deaths, na.rm = TRUE)
  ) %>%
  filter(is.na(region) == FALSE)
## `summarise()` regrouping output by 'year', 'month' (override with `.groups` argument)
#%>%
  #left_join(df_pop, by = "region") #Add the population for each region

df_covid_total
df_covid_total %>% #plot the covid data just to take a quick look and make sure it doesn't look totally off
  ggplot() +
  geom_line(aes(x = month, y = cases, color = region, group = region))

full_dataset <- data_split %>% 
  left_join(df_covid_total, by = c("area_name" = "region", "month", "year")) %>% #merge the covid dataset into the price dataset
  left_join(df_pop, by = c("area_name" = "region")) %>%
  mutate(
    cases = cases - lag(cases),
    deaths = deaths - lag(deaths)
  ) %>%
  mutate(
    cases_per100k = (cases/population) * 100000,
    deaths_per100k = (deaths/population) * 100000
  )
full_dataset
coeff <- 0.001

full_dataset %>%
  filter(
    item == 'FD3101',
    year == 2020
  ) %>%
  ggplot() +
  geom_point(aes(x = month, y = cases_per100k, color = area_name)) +
  geom_line(aes(x = month, y = cases_per100k, color = area_name, group = area_name)) +
  geom_point(aes(x = month, y = price / coeff, color = area_name), shape  = 2) +
  geom_line(aes(x = month, y = price / coeff, color = area_name, group = area_name)) +
  scale_y_continuous(
    name = "cases",
    sec.axis = sec_axis(~.*coeff, name="price")
  ) + 
  theme_common() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_grid(~year) +
  ggtitle("Price of Potatos by Region")
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Removed 14 row(s) containing missing values (geom_path).
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 12 row(s) containing missing values (geom_path).

coeff <- 0.000002

full_dataset %>%
  filter(
    item == 'FD3101',
    area_name == 'Midwest',
    year == 2020
  ) %>%
  ggplot() +
  geom_point(aes(x = month, y = cases,), shape = 1) +
  geom_point(aes(x = month, y = price/coeff)) +
  scale_y_continuous(
    name = "Number of Cases",
    sec.axis = sec_axis(~.*coeff, name="Price per Pound ($)")
  ) +
  theme_common() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  facet_grid(~ year) +
  ggtitle("Price of All Pork Chop vs Number of Coronavirus Cases in the Midwest")
## Warning: Removed 3 rows containing missing values (geom_point).

## Warning: Removed 3 rows containing missing values (geom_point).

Presentation Plots:

coeff <- 0.003
full_dataset %>%
  mutate(
    price = (price - 2) / coeff
  ) %>%
 # pivot_longer(
    #cols = c(cases, price, cases_per100k),
    #names_to = "variable",
    #values_to = "value"
  #) %>%
  filter(
    item == 'FD3101',
    # area_name == 'Midwest',
    year >= 2017,
    #variable == "cases_per100k" | variable == "price"
  ) %>%
  mutate(
    year_code = ifelse(year < 2020, "Previous Year Prices", "Price 2020")
    # year = as.character(year)
  ) %>%
  ggplot(aes(x = month)) +
  # geom_point(aes(y = price, color = paste(year, " Price"))) +
  # geom_line(aes(y = price, color = paste(year, " Price"), group = year)) +
  geom_point(aes(y = price, color = year_code)) +
  geom_line(aes(y = price, color = year_code, group = year)) +
  scale_color_manual(
    values = c("Previous Year Prices" = "grey", "Price 2020" = "blue", "COVID Cases Per 100k" = "black"), 
    name = "Legend",
    breaks = c("Previous Year Prices", "Price 2020", "COVID Cases Per 100k")) +
  geom_point(aes(y = cases_per100k)) + 
  geom_line(aes(y = cases_per100k, group = year, color = "COVID Cases Per 100k")) +
  scale_y_continuous(
    name = "Cases Per 100k",
    sec.axis = sec_axis(~.*coeff + 2, name="Price ($USD)")
  ) +
  theme_common() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        strip.text.y = element_text(size = 10, angle = 270),
        aspect.ratio = 0.5,
        # legend.title=element_blank()
        ) +
  facet_wrap(~ area_name) +
  xlab("Month") +
  ggtitle("Pork Price (per lb) and COVID Cases by Region")
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 3 row(s) containing missing values (geom_path).
## Warning: Removed 158 rows containing missing values (geom_point).
## Warning: Removed 147 row(s) containing missing values (geom_path).

Notes:

The purpose of this graph was to illustrate the general trends of Pork Chop prices by pound in the four US regions over time. It should be noted that there are gaps in the Northeast Porkchop prices in April. Generally the prices of pork chops are elevated in all regions from where they initially quite close together in January. Data from 2019 pork prices were included to show typical monthly variance in price by region.

coeff <- 0.003
full_dataset %>%
  mutate(
    price = (price - 2) / coeff
  ) %>%
 # pivot_longer(
    #cols = c(cases, price, cases_per100k),
    #names_to = "variable",
    #values_to = "value"
  #) %>%
  filter(
    item == 'FC1101',
    # area_name == 'Midwest',
    year >= 2017,
    #variable == "cases_per100k" | variable == "price"
  ) %>%
  mutate(
    year_code = ifelse(year < 2020, "Previous Year Prices", "Price 2020")
    # year = as.character(year)
  ) %>%
  ggplot(aes(x = month)) +
  # geom_point(aes(y = price, color = paste(year, " Price"))) +
  # geom_line(aes(y = price, color = paste(year, " Price"), group = year)) +
  geom_point(aes(y = price, color = year_code)) +
  geom_line(aes(y = price, color = year_code, group = year)) +
  scale_color_manual(
    values = c("Previous Year Prices" = "grey", "Price 2020" = "blue", "COVID Cases Per 100k" = "black"), 
    name = "Legend",
    breaks = c("Previous Year Prices", "Price 2020", "COVID Cases Per 100k")) +
  geom_point(aes(y = cases_per100k)) + 
  geom_line(aes(y = cases_per100k, group = year, color = "COVID Cases Per 100k")) +
  scale_y_continuous(
    name = "Cases Per 100k",
    sec.axis = sec_axis(~.*coeff + 2, name="Price ($USD)")
  ) +
  theme_common() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        strip.text.y = element_text(size = 10, angle = 270),
        aspect.ratio = 0.5,
        # legend.title=element_blank()
        ) +
  facet_wrap(~ area_name) +
  xlab("Month") +
  ggtitle("Ground Beef Price (per lb) and COVID Cases by Region")
## Warning: Removed 14 rows containing missing values (geom_point).
## Warning: Removed 3 row(s) containing missing values (geom_path).
## Warning: Removed 158 rows containing missing values (geom_point).
## Warning: Removed 147 row(s) containing missing values (geom_path).

Notes:

Within this figure, we can see increases in price for beef in all regions around June. This also appears to loosely correspond to when coronavirus cases began to increase mostly in the Northeast and South. The time of the spike in the Northeast appears to roughly equate to the time in which the price of ground beef increased in price throughout all regions within the United States. Interestingly it appears that the price of beef in the midwest was highest when coronavirus cases spiked downward for one month.

coeff <- 0.003
full_dataset %>%
  mutate(
    price = (price - 2) / coeff
  ) %>%
 # pivot_longer(
    #cols = c(cases, price, cases_per100k),
    #names_to = "variable",
    #values_to = "value"
  #) %>%
  filter(
    item == 'FF1101',
    # area_name == 'Midwest',
    year >= 2017,
    #variable == "cases_per100k" | variable == "price"
  ) %>%
  mutate(
    year_code = ifelse(year < 2020, "Previous Year Prices", "Price 2020")
    # year = as.character(year)
  ) %>%
  ggplot(aes(x = month)) +
  # geom_point(aes(y = price, color = paste(year, " Price"))) +
  # geom_line(aes(y = price, color = paste(year, " Price"), group = year)) +
  geom_point(aes(y = price, color = year_code)) +
  geom_line(aes(y = price, color = year_code, group = year)) +
  scale_color_manual(
    values = c("Previous Year Prices" = "grey", "Price 2020" = "blue", "COVID Cases Per 100k" = "black"), 
    name = "Legend",
    breaks = c("Previous Year Prices", "Price 2020", "COVID Cases Per 100k")) +
  geom_point(aes(y = cases_per100k)) + 
  geom_line(aes(y = cases_per100k, group = year, color = "COVID Cases Per 100k")) +
  scale_y_continuous(
    name = "Cases Per 100k",
    sec.axis = sec_axis(~.*coeff + 2, name="Price ($USD)")
  ) +
  theme_common() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        strip.text.y = element_text(size = 10, angle = 270),
        aspect.ratio = 0.5,
        # legend.title=element_blank()
        ) +
  facet_wrap(~ area_name) +
  xlab("Month") +
  ggtitle("Chicken Breast Price (per lb) and COVID Cases by Region")
## Warning: Removed 15 rows containing missing values (geom_point).
## Warning: Removed 3 row(s) containing missing values (geom_path).
## Warning: Removed 158 rows containing missing values (geom_point).
## Warning: Removed 147 row(s) containing missing values (geom_path).

Notes:

Within this figure, we can see that the price of chicken breast remained much more steady than prices of other items we previously looked at. There appears to be a slight increase in price, which is most noticeable in the Midwest and the Northeast, but for the most part it remained steady. Increasingly there appears to be missing data during April and May for the West and just in May for the Northeast. However, the surrounding months do not appear to fluctuate like they had for other types of meats.

Dataset Commentary:

Conclusions:

Further Questions:

  • What happened to the missing data? It’s unlikely the northeast just stopped consuming chicken, so why is there no data on it since last October? Other items also have similar gaps.